# Core tidyverse stack
library(tidyverse)
library(readxl)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(lubridate)  # year(), month(), ymd() used below; not attached by older tidyverse
# Tables / plotting extras
library(DT)
library(patchwork)
library(ggthemes)
library(plotly)
library(gapminder)
library(grid)
library(gridExtra)
# Maps
library(maps)
library(usmap)
# install.packages("DescTools")
# install.packages("car")
# install.packages("rpart")
# install.packages("rpart.plot")
library(DescTools)
library(car)
library(rpart)
library(rpart.plot)
Exploring the Impact of COVID-19 on State-Level Economic Indicators and Housing Prices in the U.S.
This project explores how the COVID-19 pandemic affected economic conditions, focusing on housing prices across U.S. states. Although the final goal is to analyze which factors contributed to post-COVID changes in housing prices, the project also covers the essential groundwork: collecting, cleaning, visualizing, and statistically analyzing the data.
Includes two datasets: one providing nationwide annual COVID case totals and another offering state-by-state case counts from 2020 to 2023. These datasets help track the spread and intensity of the pandemic across different regions and time periods, offering important context for analyzing trends.
Source: Centers for Disease Control and Prevention (CDC) (https://covid.cdc.gov/COVID-DATA-TRACKER/#datatracker-home) & USA Facts (https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/)
File name: cases_deaths.csv & covid_confirmed_usafacts.csv
A measure of the typical home value and market changes across a given region and housing type. It reflects the typical value for homes in the 35th to 65th percentile range. Available as a smoothed, seasonally adjusted measure and as a raw measure.
Source: Zillow Research (https://www.zillow.com/research/data/)
File name: State_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv
This dataset shows monthly unemployment rates by U.S. state. It is part of the Local Area Unemployment Statistics (LAUS) program by the Bureau of Labor Statistics (BLS). The data is seasonally adjusted and helps track labor market trends at the state level over time.
Source: U.S. Bureau of Labor Statistics (https://www.bls.gov/charts/state-employment-and-unemployment/state-unemployment-rates-animated.htm)
File name: unemployment rate.xlsx
Reports annual state-level PCE data, measuring average household spending across different regions.
Source: Bureau of Economic Analysis (BEA) (https://apps.bea.gov/regional/downloadzip.htm)
File name: SAPCE1__ALL_AREAS_1997_2023.csv
Includes annual personal income per capita by state. Useful for assessing economic status and regional income disparities. Source: Bureau of Economic Analysis (BEA) File name: SAINC1__ALL_AREAS_1929_2024.csv → INCOME.csv
Provides real GDP data by state from 1997 to 2024. This helps analyze economic output and growth across regions.
Source: Bureau of Economic Analysis (BEA)
File name: SAGDP1__ALL_AREAS_1997_2024.csv
Includes per capita credit card debt and total household debt across states. Used to assess consumer financial health.
| Sheet | Title |
|---|---|
| population | Number of Consumers in New York Fed Consumer Credit Panel |
| auto loan | Auto Debt Balance per Capita |
| credit card | Credit Card Debt Balance per Capita |
| mortgage | Mortgage Debt Balance per Capita (excluding HELOC) |
| student loan | Student Loan Debt Balance per Capita |
| total | Total Debt Balance per Capita |
| auto loan delinquency | Percent of Auto Debt Balance 90+ Days Delinquent |
| credit card delinquency | Percent of Credit Card Debt Balance 90+ Days Delinquent |
| mortgage delinquency | Percent of Mortgage Debt Balance 90+ Days Delinquent |
| student loan delinquency | Percent of Student Loan Debt Balance 90+ Days Delinquent (and in default) |
Source: New York Fed (https://www.newyorkfed.org/microeconomics/databank.html) File name: area_report_by_year.xlsx → NYFED.csv
# Nationwide daily COVID cases/deaths (CDC/Our World style file).
covid <- read_csv("data/cases_deaths.csv")
#View(covid)
# Keep US rows only, derive calendar year and full month name, then
# aggregate the daily series to one row per (year, month).
covid_US <- covid %>%
  filter(country == "United States") %>%
  select(country, date, new_cases, total_cases, weekly_cases) %>%
  mutate(
    date  = as.Date(date),
    year  = year(date),
    month = month(date, label = TRUE, abbr = FALSE)
  ) %>%
  group_by(year, month) %>%
  summarise(
    monthly_new_cases  = sum(new_cases, na.rm = TRUE),   # cases added that month
    latest_total_cases = max(total_cases, na.rm = TRUE), # cumulative count at month end
    total_weekly_cases = sum(weekly_cases, na.rm = TRUE),
    .groups = "drop"
  )
covid_US
## # A tibble: 145 × 5
## year month monthly_new_cases latest_total_cases total_weekly_cases
## <dbl> <ord> <dbl> <dbl> <dbl>
## 1 1 January 820465 103436829 4160575
## 2 1 February 701457 103436829 4418032
## 3 1 March 531503 103436829 4672369
## 4 1 April 616176 103436829 4930142
## 5 1 May 1107849 103436829 5419154
## 6 1 June 1378542 103436829 5839205
## 7 1 July 945231 103436829 6101223
## 8 1 August 1056186 103436829 6336944
## 9 1 September 1122716 103436829 6758203
## 10 1 October 740848 103436829 6967548
## # ℹ 135 more rows
# State-level confirmed cases (USAFacts): county rows, one column per date.
covidbystate <- read_csv("data/covid_confirmed_usafacts.csv")
# Collapse county rows to one row per state. Each date column is cumulative,
# so summing across counties yields the state's cumulative series.
# NOTE: passing extra args through across(.fns, ...) is deprecated since
# dplyr 1.1 — na.rm is supplied inside an anonymous function instead.
covidStatesDaily <- covidbystate %>%
  group_by(State) %>%
  summarize(across(where(is.numeric), ~ sum(.x, na.rm = TRUE)))
#covidStatesDaily
# Reshape the wide date columns to long; the regex keeps only columns whose
# names look like YYYY-MM-DD (drops any non-date numeric columns that also
# survived the sum). The year-end total is the max of the cumulative series.
covidstates <- covidStatesDaily %>%
  pivot_longer(
    cols = -State,
    names_to = "date",
    values_to = "cases"
  ) %>%
  filter(str_detect(date, "^\\d{4}-\\d{2}-\\d{2}$")) %>%
  mutate(
    date = ymd(date),
    year = year(date)
  ) %>%
  group_by(State, year) %>%
  summarise(total_cases = max(cases, na.rm = TRUE), .groups = "drop")
#covidstates
# Severity bins from summary(): [1] min, [2] Q1, [3] median, [5] Q3.
# Note "Very low" is only reached by values equal to the minimum exactly;
# the remaining bins are quartiles.
case_summary <- summary(covidstates$total_cases)
#case_summary
covidstates <- covidstates %>%
  mutate(
    severity = case_when(
      total_cases <= case_summary[1] ~ "Very low",
      total_cases <= case_summary[2] ~ "Low",
      total_cases <= case_summary[3] ~ "Medium",
      total_cases <= case_summary[5] ~ "High",
      TRUE ~ "Very high"
    )
  )
# Ordered factor so plots show the bins from lowest to highest severity.
covidstates <- covidstates %>%
  mutate(severity = factor(severity, levels = c("Very low", "Low", "Medium", "High", "Very high")))
covidstates
## # A tibble: 204 × 4
## State year total_cases severity
## <chr> <dbl> <dbl> <fct>
## 1 AK 2020 46304 Low
## 2 AK 2021 149907 Low
## 3 AK 2022 277884 Low
## 4 AK 2023 287319 Low
## 5 AL 2020 361226 Medium
## 6 AL 2021 896614 High
## 7 AL 2022 1568934 High
## 8 AL 2023 1659936 High
## 9 AR 2020 225138 Low
## 10 AR 2021 562455 Medium
## # ℹ 194 more rows
# Zillow ZHVI: typical home value per state, monthly, wide format.
house <- read_csv("data/State_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")
# Drop metadata columns; use the full state name as the key.
house1 <- house %>%
  rename(State = RegionName) %>%
  select(-StateName, -RegionID, -SizeRank, -RegionType)
# Wide month columns -> long (State, Date, ZHVI).
house_long <- house1 %>%
  pivot_longer(cols = -State, names_to = "Date", values_to = "ZHVI")
#house_long
# Restrict to the 2017-2022 window used for the pre/post comparison.
house_long1 <- house_long %>%
  filter(str_sub(Date, 1, 4) %in% as.character(2017:2022))
#house_long1
# Pull the 4-digit year out of the YYYY-MM-DD column names.
house_long2 <- house_long1 %>%
  mutate(year = as.numeric(str_sub(Date, 1, 4)))
# Average the monthly values within each state-year.
house_avg <- house_long2 %>%
  group_by(State, year) %>%
  summarise(mean_zhvi = mean(ZHVI, na.rm = TRUE), .groups = "drop")
#house_avg
# Back to wide: one column per year, one row per state.
house2 <- house_avg %>%
  pivot_wider(names_from = year, values_from = mean_zhvi)
#house2
# Pre (2017-2019) vs post (2020-2022) averages and percent changes.
house3 <- house2 %>%
  mutate(
    zhvi_pre_avg     = rowMeans(select(., `2017`, `2018`, `2019`), na.rm = TRUE),
    zhvi_post_avg    = rowMeans(select(., `2020`, `2021`, `2022`), na.rm = TRUE),
    zhvi_pre_change  = 100 * (`2019` - `2017`) / `2017`,
    zhvi_post_change = 100 * (`2022` - `2020`) / `2020`
  ) %>%
  select(State, zhvi_pre_avg, zhvi_post_avg, zhvi_pre_change, zhvi_post_change)
#house3
# Full state name -> two-letter code lookup (also reused by later sections).
state_codes <- read.csv("https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv")
house4 <- house3 %>%
  left_join(state_codes, by = c("State" = "State"))
# Keep the abbreviation as the State key so all tables share one key format.
house_final <- house4 %>%
  select(Abbreviation, zhvi_pre_avg, zhvi_post_avg, zhvi_pre_change, zhvi_post_change) %>%
  rename(State = Abbreviation)
house_final
## # A tibble: 51 × 5
## State zhvi_pre_avg zhvi_post_avg zhvi_pre_change zhvi_post_change
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 AL 148993. 192427. 9.76 28.0
## 2 AK 292351. 335028. 7.66 12.3
## 3 AZ 252522. 368418. 15.0 49.4
## 4 AR 140966. 175352. 7.83 26.1
## 5 CA 521016. 663241. 12.7 31.3
## 6 CO 375678. 486205. 12.6 32.8
## 7 CT 251600. 305414. 3.63 28.0
## 8 DE 262137. 318745. 5.58 27.9
## 9 DC 560797. 633072. 8.25 7.84
## 10 FL 228291. 308956. 13.2 45.6
## # ℹ 41 more rows
# Export to file; commented out because the synced GitHub folder already contains it.
#write.csv(house_final, "data/data-clean/HOUSING.csv", row.names = FALSE)
# BLS state unemployment rates (seasonally adjusted): one row per area,
# one column per month.  Range A1:DR53 = header row + 52 areas.
unemploy <- read_excel("data/unemployment rate.xlsx", range = "A1:DR53")
# Excel stores the month headers as serial day numbers; convert every column
# name except `State` to a "YYYY-MM-DD" string (1899-12-30 is Excel's
# day-zero on Windows).
names(unemploy)[-1] <- as.character(as.Date(as.numeric(names(unemploy)[-1]), origin = "1899-12-30"))
# Pre-COVID window: change in the December rate from 2017 to 2019
# (difference in percentage points, not a percent change).
pre_df <- unemploy %>%
select(State, `2017-12-01`, `2019-12-01`) %>%
# Some columns come in as character, so coerce to numeric first.
mutate(across(-State, ~ as.numeric(.))) %>%
mutate(unemp_pre = `2019-12-01` - `2017-12-01`) %>%
rename(unemp_19 = `2019-12-01`, unemp_17=`2017-12-01`) %>%
select(State, unemp_pre, unemp_17 ,unemp_19)
# Post-COVID window: change in the December rate from 2020 to 2022.
post_df <- unemploy %>%
select(State, `2020-12-01`, `2022-12-01`) %>%
# Some columns come in as character, so coerce to numeric first.
mutate(across(-State, ~ as.numeric(.))) %>%
mutate(unemp_post = `2022-12-01` - `2020-12-01`) %>%
rename(unemp_22 = `2022-12-01`, unemp_20 = `2020-12-01`) %>%
select(State, unemp_post, unemp_20, unemp_22)
# Combine the two windows on the full state name.
unemploy2 <- pre_df %>%
inner_join(post_df, by = "State")
# Map full state names to two-letter codes (state_codes loaded earlier).
unemploy3 <- unemploy2 %>%
left_join(state_codes, by = c("State" = "State"))
# Keep the abbreviation as the State key, matching the other cleaned tables.
unemploy_final <- unemploy3 %>%
select(Abbreviation, unemp_pre,unemp_17, unemp_19, unemp_post,unemp_20, unemp_22) %>%
rename(State = Abbreviation)
unemploy_final
## # A tibble: 52 × 7
## State unemp_pre unemp_17 unemp_19 unemp_post unemp_20 unemp_22
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AL -0.9 4 3.1 -2.2 4.5 2.3
## 2 AK -1.4 6.6 5.2 -3.1 7.1 4
## 3 AZ -0.200 4.9 4.7 -2.8 6.5 3.7
## 4 AR -0.300 3.8 3.5 -2 5 3
## 5 CA -0.400 4.5 4.1 -4.6 9 4.4
## 6 CO -0.100 2.9 2.8 -3.4 6.4 3
## 7 CT -0.600 4.3 3.7 -3.9 7.4 3.5
## 8 DE -0.6 4.2 3.6 -1.6 5.6 4
## 9 DC -0.5 5.9 5.4 -3.1 7.8 4.7
## 10 FL -1 4 3 -3.3 6.2 2.9
## # ℹ 42 more rows
# Export to file; commented out because the synced GitHub folder already contains it.
#write.csv(unemploy_final, "data/data-clean/UNEMPLOY.csv", row.names = FALSE)
# BEA personal consumption expenditures (SAPCE1), annual by state.
pce <- read.csv("data/SAPCE1__ALL_AREAS_1997_2023.csv")
# Keep the total-PCE line only (the source label carries a trailing space),
# drop the national row, and compute percent growth over the pre
# (2017 -> 2019) and post (2020 -> 2022) windows.
pce_df <- pce %>%
  filter(Description == "Personal consumption expenditures ") %>%
  select(GeoName, X2017, X2019, X2020, X2022) %>%
  rename(state = GeoName) %>%
  filter(state != "United States") %>%
  mutate(
    pce_pre  = 100 * (X2019 - X2017) / X2017,
    pce_post = 100 * (X2022 - X2020) / X2020
  )
# Two-letter codes so the table keys match the other cleaned datasets.
pce_final <- pce_df %>%
  left_join(state_codes, by = c("state" = "State")) %>%
  select(Abbreviation, pce_pre, pce_post) %>%
  rename(State = Abbreviation)
head(pce_final)
## State pce_pre pce_post
## 1 AL 8.277837 22.66264
## 2 AK 5.572225 23.88433
## 3 AZ 10.681811 27.98044
## 4 AR 6.486729 22.09850
## 5 CA 10.787929 25.65983
## 6 CO 11.319759 27.96054
# Export to file; commented out because the synced GitHub folder already contains it.
#write.csv(pce_final, "data/data-clean/PCE.csv", row.names = FALSE)
# BEA per-capita personal income (SAINC1), annual by state.
income <- read.csv("data/SAINC1__ALL_AREAS_1929_2024.csv")
# The description field carries surrounding whitespace, so trim before
# matching; drop the national row and compute pre/post percent growth.
income_df1 <- income %>%
  filter(str_trim(Description) == "Per capita personal income (dollars) 2/") %>%
  select(GeoName, X2017, X2019, X2020, X2022) %>%
  rename(state = GeoName) %>%
  filter(state != "United States") %>%
  mutate(
    income_pre  = 100 * (X2019 - X2017) / X2017,
    income_post = 100 * (X2022 - X2020) / X2020
  )
# Two-letter codes so the table keys match the other cleaned datasets.
income_final <- income_df1 %>%
  left_join(state_codes, by = c("state" = "State")) %>%
  select(Abbreviation, income_pre, income_post) %>%
  rename(State = Abbreviation)
head(income_final)
## State income_pre income_post
## 1 AL 7.581065 12.688960
## 2 AK 7.086726 11.401225
## 3 AZ 10.317882 13.134632
## 4 AR 5.637273 17.407643
## 5 CA 10.315388 9.528295
## 6 CO 13.109126 18.359079
# Export to file; commented out because the synced GitHub folder already contains it.
#write.csv(income_final, "data/data-clean/INCOME.csv", row.names = FALSE)
# BEA real GDP by state (SAGDP1, chained 2017 dollars).
gdp_df <- read.csv("data/SAGDP1__ALL_AREAS_1997_2024.csv")
# Keep the real-GDP line only (the label includes a "1/" footnote marker).
gdp_real <- gdp_df %>%
  filter(Description == "Real GDP (millions of chained 2017 dollars) 1/")
# Percent growth over the pre (2017 -> 2019) and post (2020 -> 2022)
# windows, dropping the national aggregate row.
gdp_df2 <- gdp_real %>%
  select(GeoName, X2017, X2019, X2020, X2022) %>%
  rename(state = GeoName) %>%
  mutate(
    gdp_pre  = 100 * (X2019 - X2017) / X2017,
    gdp_post = 100 * (X2022 - X2020) / X2020
  ) %>%
  filter(state != "United States")
# Two-letter codes so the table keys match the other cleaned datasets.
gdp_final <- gdp_df2 %>%
  left_join(state_codes, by = c("state" = "State")) %>%
  select(Abbreviation, gdp_pre, gdp_post) %>%
  rename(State = Abbreviation)
head(gdp_final)
## State gdp_pre gdp_post
## 1 AL 3.996621 7.3182724
## 2 AK -2.191186 0.8628568
## 3 AZ 7.948898 12.3828137
## 4 AR 2.694002 8.7421859
## 5 CA 8.358128 8.5462064
## 6 CO 9.570425 10.1607412
# Export to file; commented out because the synced GitHub folder already contains it.
#write.csv(gdp_final, "data/data-clean/GDP.csv", row.names = FALSE)
# NY Fed consumer credit panel workbook: one sheet per debt measure.
file_path <- "data/area_report_by_year.xlsx"
# Only these sheets are needed; they share a common layout.
# The "auto" sheet uses a different layout, so it is read separately below.
target_sheets <- c(
"population",
"creditcard",
"mortgage",
"student loan",
"total",
"auto_delinq",
"creditcard_delinq",
"mortgage_delinq",
"studentloan_delinq"
)
# The auto sheet skips only 4 rows (vs 8 below) — presumably its header
# sits higher in the sheet; verify against the workbook.
auto <- read_excel(file_path, sheet = "auto", skip = 4, col_names = TRUE)
#auto
# Read each target sheet and assign it to a variable named after the sheet,
# with spaces replaced by underscores (e.g. "student loan" -> student_loan).
for (sheet_name in target_sheets) {
df_name <- gsub(" ", "_", sheet_name)
# The real data starts at row 9 of the xlsx file, so skip the first 8 rows.
df <- read_excel(file_path, sheet = sheet_name, skip = 8, col_names = TRUE)
assign(df_name, df)
#print(df_name)
}
# After load the file, it's nice to check but I will make it # to make the document short.
#population
#auto
#creditcard
#mortgage
#student_loan
#total
#auto_delinq
#creditcard_delinq
#mortgage_delinq
#studentloan_delinq
#population
# Helper: percent change in a per-capita balance over the pre
# (Q4 2017 -> Q4 2019) and post (Q4 2020 -> Q4 2022) windows.
# `prefix` names the output columns, e.g. prefix = "creditcard"
# produces creditcard_pre / creditcard_post.  Replaces ten copies of
# the same filter/mutate/select pipeline.
pct_change_clean <- function(data, prefix) {
  data %>%
    filter(!is.na(state)) %>%
    mutate(
      "{prefix}_pre"  := ((Q4_2019 - Q4_2017) / Q4_2017) * 100,
      "{prefix}_post" := ((Q4_2022 - Q4_2020) / Q4_2020) * 100
    ) %>%
    select(state, all_of(paste0(prefix, c("_pre", "_post"))))
}

# Helper: delinquency sheets already hold rates, so just snapshot the
# Q4 2019 (pre) and Q4 2022 (post) columns under prefixed names.
delinq_clean <- function(data, prefix) {
  data %>%
    filter(!is.na(state)) %>%
    select(state, Q4_2019, Q4_2022) %>%
    rename(
      "{prefix}_delinq_pre"  := Q4_2019,
      "{prefix}_delinq_post" := Q4_2022
    )
}

# Balance sheets: percent change pre/post.
population_clean  <- pct_change_clean(population, "population")
creditcard_clean  <- pct_change_clean(creditcard, "creditcard")
auto_clean        <- pct_change_clean(auto, "auto")
mortgage_clean    <- pct_change_clean(mortgage, "mortgage")
studentloan_clean <- pct_change_clean(student_loan, "studentloan")
total_clean       <- pct_change_clean(total, "total")

# Delinquency sheets: raw pre/post rate snapshots.
creditcard_delinq_clean  <- delinq_clean(creditcard_delinq, "creditcard")
auto_delinq_clean        <- delinq_clean(auto_delinq, "auto")
mortgage_delinq_clean    <- delinq_clean(mortgage_delinq, "mortgage")
studentloan_delinq_clean <- delinq_clean(studentloan_delinq, "studentloan")
# Assemble the NYFED table: every cleaned measure joined on the
# state code, one row per state.
NYFED_df <- list(
  population_clean,
  creditcard_clean,
  auto_clean,
  mortgage_clean,
  studentloan_clean,
  total_clean,
  creditcard_delinq_clean,
  auto_delinq_clean,
  mortgage_delinq_clean,
  studentloan_delinq_clean
) %>%
  reduce(left_join, by = "state")
head(NYFED_df)
## # A tibble: 6 × 21
## state population_pre population_post creditcard_pre creditcard_post auto_pre
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK -0.444 -1.15 3.98 13.6 -0.998
## 2 AL 1.58 1.22 7.59 15.5 7.74
## 3 AR 1.53 2.31 10 16.1 5.34
## 4 AZ 4.47 4.48 10.2 16.5 9.41
## 5 CA 1.57 1.87 11.4 19.8 7.53
## 6 CO 3.31 3.61 8.22 17.6 4.75
## # ℹ 15 more variables: auto_post <dbl>, mortgage_pre <dbl>,
## # mortgage_post <dbl>, studentloan_pre <dbl>, studentloan_post <dbl>,
## # total_pre <dbl>, total_post <dbl>, creditcard_delinq_pre <dbl>,
## # creditcard_delinq_post <dbl>, auto_delinq_pre <dbl>,
## # auto_delinq_post <dbl>, mortgage_delinq_pre <dbl>,
## # mortgage_delinq_post <dbl>, studentloan_delinq_pre <dbl>,
## # studentloan_delinq_post <dbl>
# Export final result if needed
#write.csv(NYFED_df, "data/data-clean/NYFED.csv", row.names = FALSE)
# Re-load the cleaned per-source tables from disk so this section is
# runnable without re-executing the cleaning pipeline above.
clean_gdp      <- read_csv("data/data-clean/GDP.csv")
clean_income   <- read_csv("data/data-clean/INCOME.csv")
clean_nyfed    <- read_csv("data/data-clean/NYFED.csv")
clean_pce      <- read_csv("data/data-clean/PCE.csv")
clean_unemploy <- read_csv("data/data-clean/UNEMPLOY.csv")
clean_housing  <- read_csv("data/data-clean/HOUSING.csv")
#clean_gdp
#clean_income
#clean_nyfed
#clean_pce
#clean_unemploy
# NYFED uses lower-case `state`; align it with the other tables.
clean_nyfed <- clean_nyfed %>%
  rename(State = state)
# States present in every table.
common_states <- reduce(
  list(
    clean_gdp$State,
    clean_income$State,
    clean_pce$State,
    clean_nyfed$State,
    clean_unemploy$State,
    clean_housing$State
  ),
  intersect
)
# Restrict each table to the shared set of states.
clean_gdp      <- filter(clean_gdp, State %in% common_states)
clean_income   <- filter(clean_income, State %in% common_states)
clean_pce      <- filter(clean_pce, State %in% common_states)
clean_nyfed    <- filter(clean_nyfed, State %in% common_states)
clean_unemploy <- filter(clean_unemploy, State %in% common_states)
clean_housing  <- filter(clean_housing, State %in% common_states)
# One wide table: housing outcomes plus every predictor, keyed by state code.
merged_df <- clean_housing %>%
  left_join(clean_gdp, by = "State") %>%
  left_join(clean_income, by = "State") %>%
  left_join(clean_pce, by = "State") %>%
  left_join(clean_nyfed, by = "State") %>%
  left_join(clean_unemploy, by = "State")
merged_df
## # A tibble: 51 × 37
## State zhvi_pre_avg zhvi_post_avg zhvi_pre_change zhvi_post_change gdp_pre
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AL 148993. 192427. 9.76 28.0 4.00
## 2 AK 292351. 335028. 7.66 12.3 -2.19
## 3 AZ 252522. 368418. 15.0 49.4 7.95
## 4 AR 140966. 175352. 7.83 26.1 2.69
## 5 CA 521016. 663241. 12.7 31.3 8.36
## 6 CO 375678. 486205. 12.6 32.8 9.57
## 7 CT 251600. 305414. 3.63 28.0 0.493
## 8 DE 262137. 318745. 5.58 27.9 7.29
## 9 DC 560797. 633072. 8.25 7.84 3.43
## 10 FL 228291. 308956. 13.2 45.6 6.90
## # ℹ 41 more rows
## # ℹ 31 more variables: gdp_post <dbl>, income_pre <dbl>, income_post <dbl>,
## # pce_pre <dbl>, pce_post <dbl>, population_pre <dbl>, population_post <dbl>,
## # creditcard_pre <dbl>, creditcard_post <dbl>, auto_pre <dbl>,
## # auto_post <dbl>, mortgage_pre <dbl>, mortgage_post <dbl>,
## # studentloan_pre <dbl>, studentloan_post <dbl>, total_pre <dbl>,
## # total_post <dbl>, creditcard_delinq_pre <dbl>, …
# Export to file; commented out because the synced GitHub folder already contains it.
# write.csv(merged_df, "data/data-clean/FINAL.csv", row.names = FALSE)
# Final merged dataset (written earlier to data-clean/FINAL.csv).
df <- read_csv("data/data-clean/FINAL.csv")
#df
#ggplot(covidstates, aes(x = severity, fill = severity)) +
# geom_bar() +
# labs(title = "Number of States by COVID Severity Level",
# x = "Severity Level", y = "Number of States") +
# scale_fill_manual(values = c("Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange", "High" = #"red", "Very high" = "brown")) +
# theme_minimal()
# Spread of year-end cumulative cases within each severity bin.
ggplot(covidstates, aes(x = severity, y = total_cases, fill = severity)) +
  geom_boxplot() +
  labs(title = "Distribution of Total Cases by Severity Level",
       x = "Severity", y = "Total Cases") +
  scale_fill_manual(values = c("Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange", "High" = "red", "Very high" = "brown")) + theme_minimal()
# Share of states in each severity bin, per year (stacked to 100%).
covidstates %>%
  ggplot(aes(x = year, fill = severity)) +
  geom_bar(position = "fill") +
  labs(title = "Distribution of COVID Severity by Year",
       y = "Proportion of States", x = "Year") +
  scale_fill_manual(values = c("Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange", "High" = "red", "Very high" = "brown")) +
  theme_minimal()
# Cumulative-case trajectories for six large states.
# FIX: the `size` aesthetic for lines is deprecated since ggplot2 3.4;
# use `linewidth` instead.
covidstates %>%
  filter(State %in% c("CA", "TX", "NY", "IL", "AZ", "FL")) %>%
  ggplot(aes(x = year, y = total_cases, color = State)) +
  geom_line(linewidth = 1.2) +
  labs(title = "COVID Cases Over Time",
       x = "Year", y = "Total Cases") +
  theme_minimal()
# Lookup of two-letter codes -> full names using the built-in `state.abb`
# and `state.name` vectors (note: these cover the 50 states only; DC is
# absent, so DC rows get NA full names and drop out of the map below).
state_abbreviations <- data.frame(
abbreviation = state.abb,
full_name = state.name
)
# Replace the two-letter code with the lower-case full name so it can be
# matched against map_data("state")'s `region` column.
covidstatesFULLNAME <- covidstates %>%
left_join(state_abbreviations, by = c("State" = "abbreviation")) %>%
mutate(State = tolower(full_name))%>%
select(-full_name)
# map visualization code
us_states <- map_data("state")
us_states$region <- tolower(us_states$region)
# Merge the map data with the COVID data.
# NOTE(review): covidstatesFULLNAME holds one row per state-year, so this
# join repeats each polygon vertex once per year; the fill drawn on the map
# is effectively whichever year is plotted last — consider filtering to a
# single year before joining.
covid_map_data <- us_states %>%
left_join(covidstatesFULLNAME, by = c("region" = "State")) %>%
filter(!is.na(severity))
# Choropleth of severity.  theme_void() removes all axes, so the
# axis.text.x tweak on the last line has no visible effect.
ggplot(covid_map_data, aes(x = long, y = lat, group = group, fill = severity)) +
geom_polygon(color = "black") +
coord_fixed(1.1) +
labs(title = "COVID Case Severity by State",
fill = "Severity") +
scale_fill_manual(values = c("Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange", "High" = "red", "Very high"= "brown")) +
theme_void() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# covidstatesFULLNAME
# covidstates
| Variable Name | Formula or Definition | Description |
|---|---|---|
| population_pre | ((Population_2019 - Population_2017) / Population_2017) × 100 | Population growth rate (Pre-COVID) |
| population_post | ((Population_2022 - Population_2020) / Population_2020) × 100 | Population growth rate (Post-COVID) |
# plot_usmap() expects the state column to be named `state` (lower case).
merged_dfLC <- merged_df %>%
rename(state = State)
## PRE covid populations
# NOTE(review): the legend is titled "Population" but the plotted value is
# the population *growth rate* in percent — consider renaming the legend.
plot_usmap(data = merged_dfLC, values = "population_pre", regions = "state") +
scale_fill_continuous(
name = "Population",
low = "lightgreen", high = "darkgreen", label = scales::comma
) +
labs(title = "Pre-COVID Population (2017–2019)") +
theme(legend.position = "right")
## POST covid populations
plot_usmap(data = merged_dfLC, values = "population_post", regions = "state") +
scale_fill_continuous(
name = "Population",
low = "lightblue", high = "darkblue", label = scales::comma
) +
labs(title = "Post-COVID Population (2020–2022)") +
theme(legend.position = "right")
### pre and post side by side
# Long format so the two periods can be faceted onto one shared map/scale.
population_long <- merged_dfLC %>%
select(state, population_pre, population_post) %>%
pivot_longer(
cols = c(population_pre, population_post),
names_to = "period",
values_to = "population"
) %>%
mutate(
period = dplyr::recode(period,
"population_pre" = "Pre-COVID (2017–2019)",
"population_post" = "Post-COVID (2020–2022)"),
period = factor(period, levels = c("Pre-COVID (2017–2019)", "Post-COVID (2020–2022)"))
)
# Faceted map: pre vs post growth on a shared color scale.
plot_usmap(data = population_long, values = "population", regions = "state") +
facet_wrap(~ period) +
scale_fill_continuous(
name = "Population",
low = "lightgreen", high = "darkgreen", label = scales::comma
) +
labs(title = "Population: Pre vs. Post COVID") +
theme(legend.position = "right")
| Variable Name | Formula or Definition | Description |
|---|---|---|
| gdp_pre | ((GDP_2019 - GDP_2017) / GDP_2017) × 100 | GDP growth rate (Pre-COVID) |
| gdp_post | ((GDP_2022 - GDP_2020) / GDP_2020) × 100 | GDP growth rate (Post-COVID) |
# Pre vs post COVID real-GDP growth: one pair of nudged bars per state,
# ordered by post-COVID growth (descending).  geom_col() is the idiomatic
# shorthand for geom_bar(stat = "identity").
df %>%
  select(State, gdp_pre, gdp_post) %>%
  ggplot() +
  geom_col(
    aes(x = reorder(State, -gdp_post), y = gdp_pre,
        fill = "GDP pre = ((X2019 - X2017) / X2017) * 100"),
    position = position_nudge(x = -0.3),
    width = 0.4
  ) +
  geom_col(
    aes(x = reorder(State, -gdp_post), y = gdp_post,
        fill = "GDP post = ((X2022 - X2020) / X2020) * 100"),
    position = position_nudge(x = 0.3),
    width = 0.4
  ) +
  labs(title = "Real GDP Growth: Pre vs Post COVID", y = "GDP Change (%)", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
| Variable Name | Formula or Definition | Description |
|---|---|---|
| income_pre | ((Income_2019 - Income_2017) / Income_2017) × 100 | Personal income growth (Pre-COVID) |
| income_post | ((Income_2022 - Income_2020) / Income_2020) × 100 | Personal income growth (Post-COVID) |
# Pre vs post COVID personal-income growth: one pair of nudged bars per
# state, ordered by post-COVID growth (descending).
df %>%
  select(State, income_pre, income_post) %>%
  ggplot() +
  geom_col(
    aes(x = reorder(State, -income_post), y = income_pre,
        fill = "Income pre = ((X2019 - X2017) / X2017) * 100"),
    position = position_nudge(x = -0.3),
    width = 0.4
  ) +
  geom_col(
    aes(x = reorder(State, -income_post), y = income_post,
        fill = "Income post = ((X2022 - X2020) / X2020) * 100"),
    position = position_nudge(x = 0.3),
    width = 0.4
  ) +
  labs(title = "Personal Income Growth: Pre vs Post COVID", y = "Income Change (%)", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
| Variable Name | Formula or Definition | Description |
|---|---|---|
| pce_pre | ((PCE_2019 - PCE_2017) / PCE_2017) × 100 | PCE growth rate (Pre-COVID) |
| pce_post | ((PCE_2022 - PCE_2020) / PCE_2020) × 100 | PCE growth rate (Post-COVID) |
# Pre vs post COVID consumption growth: one pair of nudged bars per state,
# ordered by post-COVID growth (descending).
df %>%
  select(State, pce_pre, pce_post) %>%
  ggplot() +
  geom_col(
    aes(x = reorder(State, -pce_post), y = pce_pre,
        fill = "PCE pre = ((X2019 - X2017) / X2017) * 100"),
    position = position_nudge(x = -0.3),
    width = 0.4
  ) +
  geom_col(
    aes(x = reorder(State, -pce_post), y = pce_post,
        fill = "PCE post = ((X2022 - X2020) / X2020) * 100"),
    position = position_nudge(x = 0.3),
    width = 0.4
  ) +
  labs(title = "Personal Consumption Expenditures Growth: Pre vs Post COVID",
       y = "PCE Change (%)", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
| Variable Name | Formula or Definition | Description |
|---|---|---|
| unemp_pre | Unemp_2019 - Unemp_2017 | Change in unemployment rate (Pre-COVID) |
| unemp_post | Unemp_2022 - Unemp_2020 | Change in unemployment rate (Post-COVID) |
# Unemployment-rate change (percentage points): paired bars per state,
# ordered by the post-COVID change (descending).
df %>%
  select(State, unemp_pre, unemp_post) %>%
  ggplot() +
  geom_bar(aes(x = reorder(State, -unemp_post)
               , y = unemp_pre,
               fill = "Unemp pre = 2019 - 2017")
           , stat = "identity"
           , position = position_nudge(x = -0.3)
           , width = 0.4) +
  geom_bar(aes(x = reorder(State, -unemp_post)
               , y = unemp_post
               , fill = "Unemp post = (2022 - 2020)")
           , stat = "identity"
           , position = position_nudge(x = 0.3)
           , width = 0.4) +
  labs(title = "Unemployment Rate Change: 2017–2019 vs 2020–2022", y = "Change in Unemployment Rate", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),legend.position = "bottom")
# Raw December unemployment rates for the four reference years,
# reshaped long so each state becomes one line over time.
df_emp <- df %>%
  select(State, unemp_17, unemp_19, unemp_20, unemp_22)
#df_emp
df_long <- df_emp %>%
  pivot_longer(cols = starts_with("unemp_"), names_to = "Year", values_to = "Unemployment") %>%
  mutate(Year = dplyr::recode(Year,
                              "unemp_17" = "2017",
                              "unemp_19" = "2019",
                              "unemp_20" = "2020",
                              "unemp_22" = "2022"))
#df_long
# One line per state; legend suppressed (51 entries would swamp the plot).
# FIX: the `size` aesthetic for lines is deprecated since ggplot2 3.4;
# use `linewidth` instead.
ggplot(df_long, aes(x = Year, y = Unemployment, group = State, color = State)) +
  geom_line(linewidth = 1) +
  labs(title = "Unemployment Rate by State", y = "Unemployment Rate (%)", x = "Year") +
  theme(legend.position = "none")
| Variable Name | Formula or Definition | Description |
|---|---|---|
| creditcard_pre | ((Q4_2019 - Q4_2017) / Q4_2017) × 100 | Credit card debt growth rate (Pre-COVID) |
| creditcard_post | ((Q4_2022 - Q4_2020) / Q4_2020) × 100 | Credit card debt growth rate (Post-COVID) |
| creditcard_delinq_pre | Q4_2019 | Credit card delinquency rate (Pre-COVID) |
| creditcard_delinq_post | Q4_2022 | Credit card delinquency rate (Post-COVID) |
# Credit card debt: percent change, long format for plotting.
credit_df <- df %>%
  select(State, creditcard_pre, creditcard_post) %>%
  pivot_longer(cols = c(creditcard_pre, creditcard_post),
               names_to = "Time", values_to = "Change") %>%
  mutate(Time = factor(Time, levels = c("creditcard_pre", "creditcard_post")))
## credit_df
# Per-state dodged bars of the pre/post change rates.
# FIX: corrected user-facing typo "Dept" -> "Debt" in title and axis label.
ggplot(credit_df, aes(x = State, y = Change, fill = Time)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.7), width = 0.9) +
  labs(title = "Change Rate in Credit Card Debt",
       y = "Credit Card Debt Change Rate (%)", x = "") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        legend.position = "bottom")
# Delinquency rates in the same long format.
credit_delinq_df <- df %>%
  select(State, creditcard_delinq_pre, creditcard_delinq_post) %>%
  pivot_longer(cols = c(creditcard_delinq_pre, creditcard_delinq_post),
               names_to = "Time", values_to = "Change") %>%
  mutate(Time = factor(Time, levels = c("creditcard_delinq_pre", "creditcard_delinq_post")))
#credit_delinq_df
# Distribution views: debt change vs delinquency rate, side by side.
gg_debt <- ggplot(credit_df, aes(x = Time, y = Change, fill = Time)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  labs(title = "Credit Card Debt Change (%)", y = "Change Rate (%)", x = "") +
  theme(legend.position = "bottom")
#gg_debt
gg_delinq <- ggplot(credit_delinq_df, aes(x = Time, y = Change, fill = Time)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  labs(title = "Credit Card Delinquency Rate (%)", y = "Delinquency Rate (%)", x = "") +
  theme(legend.position = "bottom")
#gg_delinq
grid.arrange(gg_debt, gg_delinq, ncol = 2)
| Variable Name | Formula or Definition | Description |
|---|---|---|
| mortgage_pre | ((Q4_2019 - Q4_2017) / Q4_2017) × 100 | Mortgage debt growth rate (Pre-COVID) |
| mortgage_post | ((Q4_2022 - Q4_2020) / Q4_2020) × 100 | Mortgage debt growth rate (Post-COVID) |
| mortgage_delinq_pre | Q4_2019 | Mortgage delinquency rate (Pre-COVID) |
| mortgage_delinq_post | Q4_2022 | Mortgage delinquency rate (Post-COVID) |
# Four boxplots across states: mortgage debt change (pre/post) and
# mortgage delinquency rate (pre/post).
# FIX: corrected user-facing typos "per Capital" -> "per Capita" and
# "Dept" -> "Debt" in titles and axis labels.
# Percent change in mortgage debt balance per capita (2017 -> 2019).
qq1 <- ggplot(df, aes(x = "", y = mortgage_pre)) +
  geom_boxplot(fill = "pink") +
  labs(title = "Mortgage Debt per Capita - Pre", y = "Mortgage Debt Change Rate (%)")
# Percent change in mortgage debt balance per capita (2020 -> 2022).
qq2 <- ggplot(df, aes(x = "", y = mortgage_post)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Mortgage Debt per Capita - Post", y = "Mortgage Debt Change Rate (%)")
# Delinquency rate snapshots (Q4 2019 / Q4 2022).
qq3 <- ggplot(df, aes(x = "", y = mortgage_delinq_pre)) +
  geom_boxplot(fill = "orange") +
  labs(title = "Mortgage Delinquency - Pre", y = "Delinquency Rate (%)")
qq4 <- ggplot(df, aes(x = "", y = mortgage_delinq_post)) +
  geom_boxplot(fill = "purple") +
  labs(title = "Mortgage Delinquency - Post", y = "Delinquency Rate (%)")
grid.arrange(qq1, qq2, qq3, qq4, ncol = 4)
| Variable Name | Formula or Definition | Description |
|---|---|---|
| studentloan_pre | ((Q4_2019 - Q4_2017) / Q4_2017) × 100 | Student loan debt growth rate (Pre-COVID) |
| studentloan_post | ((Q4_2022 - Q4_2020) / Q4_2020) × 100 | Student loan debt growth rate (Post-COVID) |
| studentloan_delinq_pre | Q4_2019 | Student loan delinquency rate (Pre-COVID) |
| studentloan_delinq_post | Q4_2022 | Student loan delinquency rate (Post-COVID) |
# Student loan delinquency: paired bars per state, ordered by post value.
df %>%
  select(State, studentloan_delinq_pre, studentloan_delinq_post) %>%
  ggplot() +
  geom_col(
    aes(x = reorder(State, -studentloan_delinq_post),
        y = studentloan_delinq_pre,
        fill = "Student Loan pre = Q4_2019"),
    position = position_nudge(x = -0.3),
    width = 0.4
  ) +
  geom_col(
    aes(x = reorder(State, -studentloan_delinq_post),
        y = studentloan_delinq_post,
        fill = "Student Loan post = Q4_2022"),
    position = position_nudge(x = 0.3),
    width = 0.4
  ) +
  labs(title = "Student Loan Delinquency: Pre vs Post COVID", y = "Delinquency Rate (%)", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
# Debt-change distribution, pre vs post, as violins.  The Pre/Post factor
# is built in a single mutate (label + level order together).
student_df <- df %>%
  select(State, studentloan_pre, studentloan_post) %>%
  pivot_longer(cols = c(studentloan_pre, studentloan_post), names_to = "Time", values_to = "Change") %>%
  mutate(Time = factor(ifelse(Time == "studentloan_pre", "Pre", "Post"), levels = c("Pre", "Post")))
#student_df
gg_student <- ggplot(student_df, aes(x = Time, y = Change, fill = Time)) +
  geom_violin(trim = FALSE, alpha = 0.4) +
  labs(title = "Student Loan Debt Change (%)", y = "Change Rate (%)", x = "") +
  scale_fill_manual(values = c("Pre" = "orange", "Post" = "purple"))
#gg_student
#names(df)
# Delinquency-rate distribution, pre vs post.
student_df2 <- df %>%
  select(State, studentloan_delinq_pre, studentloan_delinq_post) %>%
  pivot_longer(cols = c(studentloan_delinq_pre, studentloan_delinq_post), names_to = "Time", values_to = "Change") %>%
  mutate(Time = factor(ifelse(Time == "studentloan_delinq_pre", "Pre", "Post"), levels = c("Pre", "Post")))
#student_df2
gg_student2 <- ggplot(student_df2, aes(x = Time, y = Change, fill = Time)) +
  geom_violin(trim = FALSE, alpha = 0.4) +
  labs(title = "Student Loan Debt Deliq Change (%)", y = "Change Rate (%)", x = "") +
  scale_fill_manual(values = c("Pre" = "yellow", "Post" = "red"))
#gg_student2
grid.arrange(gg_student, gg_student2, ncol = 2)
| Variable Name | Formula | Description |
|---|---|---|
| auto_pre | ((Q4_2019 - Q4_2017) / Q4_2017) × 100 | Auto loan debt growth rate (Pre-COVID) |
| auto_post | ((Q4_2022 - Q4_2020) / Q4_2020) × 100 | Auto loan debt growth rate (Post-COVID) |
| auto_delinq_pre | Q4_2019 | Auto loan delinquency rate (Pre-COVID) |
| auto_delinq_post | Q4_2022 | Auto loan delinquency rate (Post-COVID) |
# Compare auto loan debt growth and delinquency, pre vs post COVID,
# as one boxplot per metric on a shared axis.
auto_levels <- c("auto_pre", "auto_post", "auto_delinq_pre", "auto_delinq_post")
df_auto <- df %>%
  select(all_of(auto_levels)) %>%
  pivot_longer(cols = everything(), names_to = "Category", values_to = "Value") %>%
  mutate(Category = factor(Category, levels = auto_levels))
ggplot(df_auto, aes(x = Category, y = Value, fill = Category)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Boxplots of Auto Loan and Delinquency (Pre vs Post)",
       x = NULL,
       y = "Value") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1), legend.position = "bottom")
| Variable Name | Formula | Description |
|---|---|---|
| zhvi_post_change | ((ZHVI_2022 - ZHVI_2020) / ZHVI_2020) × 100 | Housing price change rate (Post-COVID) |
| zhvi_post_avg | (ZHVI_2020 + ZHVI_2021 + ZHVI_2022) / 3 | Average housing price during post-COVID years |
| zhvi_pre_change | ((ZHVI_2019 - ZHVI_2017) / ZHVI_2017) × 100 | Housing price change rate (Pre-COVID) |
| zhvi_pre_avg | (ZHVI_2017 + ZHVI_2018 + ZHVI_2019) / 3 | Average housing price during pre-COVID years |
# plot_usmap() expects the state column to be named `state` (lower case).
merged_dfLC <- rename(merged_df, state = State)
## heat map for PRE covid housing change rate
plot_usmap(data = merged_dfLC, values = "zhvi_pre_change", regions = "state") +
  scale_fill_continuous(
    low = "lightyellow", high = "red",
    name = "Zillow Home Value Change (%)",
    label = scales::comma
  ) +
  theme(legend.position = "right") +
  labs(title = "Pre-COVID Housing Price Change (2017–2019)")
## heat map for POST covid housing change rate
plot_usmap(data = merged_dfLC, values = "zhvi_post_change", regions = "state") +
  scale_fill_continuous(
    low = "lightblue", high = "darkblue",
    name = "Zillow Home Value Change (%)",
    label = scales::comma
  ) +
  theme(legend.position = "right") +
  labs(title = "Post-COVID Housing Price Change (2020–2022)")
### both pre and post, side by side
# One long-format row per state/period; factor labels give readable facet
# strips and fix the panel order (Pre first, Post second).
housing_long <- merged_dfLC %>%
  select(state, zhvi_pre_change, zhvi_post_change) %>%
  pivot_longer(
    cols = c(zhvi_pre_change, zhvi_post_change),
    names_to = "period",
    values_to = "change"
  ) %>%
  mutate(period = factor(period,
                         levels = c("zhvi_pre_change", "zhvi_post_change"),
                         labels = c("Pre-COVID (2017–2019)", "Post-COVID (2020–2022)")))
plot_usmap(data = housing_long, values = "change", regions = "state") +
  facet_wrap(~ period) +
  scale_fill_continuous(
    low = "lightyellow", high = "red",
    name = "ZHVI Change (%)",
    label = scales::comma
  ) +
  theme(legend.position = "right") +
  labs(title = "Housing Price Change: Pre vs. Post COVID")
# Paired comparison of average state-level ZHVI before vs after COVID.
housing_summary <- clean_housing %>%
select(State, zhvi_pre_avg, zhvi_post_avg)
# Run paired t-test
# Paired: each state contributes one pre and one post observation.
# H0: mean (post - pre) difference is zero.
t_test_result <- t.test(housing_summary$zhvi_post_avg, housing_summary$zhvi_pre_avg,
paired = TRUE, alternative = "two.sided")
t_test_result
##
## Paired t-test
##
## data: housing_summary$zhvi_post_avg and housing_summary$zhvi_pre_avg
## t = 14.386, df = 50, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 57966.91 76779.44
## sample estimates:
## mean difference
## 67373.18
# Average home value rose by ~$67,373 per state post-COVID (p < 2.2e-16).
mortgage_summary <- merged_df %>%
select(State, mortgage_pre, mortgage_post)
# NOTE(review): mortgage_summary is created but never used — the t-test below
# reads merged_df directly. Consider removing it or testing its columns.
# Run paired t-test
t_test_result2 <- t.test(merged_df$mortgage_post,merged_df$mortgage_pre,
paired = TRUE, alternative = "two.sided")
t_test_result2
##
## Paired t-test
##
## data: merged_df$mortgage_post and merged_df$mortgage_pre
## t = 25.871, df = 50, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 8.822871 10.308125
## sample estimates:
## mean difference
## 9.565498
# Mortgage growth averaged ~9.57 points higher post-COVID (p < 2.2e-16).
income_summary <- merged_df %>%
select(State, income_pre, income_post)
# NOTE(review): income_summary is created but never used — the t-test below
# reads merged_df directly, mirroring the mortgage block above.
# Run paired t-test
t_test_result3 <- t.test(merged_df$income_post,merged_df$income_pre,
paired = TRUE, alternative = "two.sided")
t_test_result3
##
## Paired t-test
##
## data: merged_df$income_post and merged_df$income_pre
## t = 10.794, df = 50, p-value = 1.156e-14
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 3.336458 4.862107
## sample estimates:
## mean difference
## 4.099282
# Income growth averaged ~4.1 points higher post-COVID (p = 1.16e-14).
# Correlation between housing prices pre and post
# Pearson r between each state's pre- and post-COVID average ZHVI.
cor_zhvi <- cor.test(merged_df$zhvi_pre_avg, merged_df$zhvi_post_avg, method = "pearson")
cor_zhvi
##
## Pearson's product-moment correlation
##
## data: merged_df$zhvi_pre_avg and merged_df$zhvi_post_avg
## t = 36.948, df = 49, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9694276 0.9900366
## sample estimates:
## cor
## 0.9825222
# r = 0.983: states that were expensive pre-COVID stayed expensive after.
# Scatter of pre- vs post-COVID average ZHVI with a fitted OLS line.
merged_df %>%
  ggplot(aes(x = zhvi_pre_avg, y = zhvi_post_avg)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  theme_minimal() +
  labs(
    title = "Correlation Between Pre and Post COVID Housing Prices",
    subtitle = "Strong positive correlation between ZHVI pre- and post-pandemic",
    x = "Pre-COVID Average Housing Prices (ZHVI)",
    y = "Post-COVID Average Housing Prices (ZHVI)"
  )
# Pearson r between pre- and post-COVID mortgage debt growth rates.
cor_mort <- cor.test(merged_df$mortgage_pre, merged_df$mortgage_post, method = "pearson")
cor_mort
##
## Pearson's product-moment correlation
##
## data: merged_df$mortgage_pre and merged_df$mortgage_post
## t = 5.7317, df = 49, p-value = 6.045e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4336644 0.7739861
## sample estimates:
## cor
## 0.6335339
# Moderate positive correlation (r = 0.63, p = 6e-07).
# Scatter of pre- vs post-COVID mortgage growth with an OLS fit.
merged_df %>%
  ggplot(aes(x = mortgage_pre, y = mortgage_post)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  theme_minimal() +
  labs(
    title = "Correlation Between Pre and Post COVID Mortgage",
    x = "Pre-COVID Average Mortgage",
    y = "Post-COVID Average Mortgage"
  )
# Pearson r between pre- and post-COVID income growth rates.
cor_income <- cor.test(merged_df$income_pre, merged_df$income_post, method = "pearson")
cor_income
##
## Pearson's product-moment correlation
##
## data: merged_df$income_pre and merged_df$income_post
## t = 2.3517, df = 49, p-value = 0.02275
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.04700942 0.54612103
## sample estimates:
## cor
## 0.3184673
# Weak positive correlation (r = 0.32); significant but much weaker than
# the housing or mortgage relationships above.
# Scatter of pre- vs post-COVID income growth with an OLS fit.
merged_df %>%
  ggplot(aes(x = income_pre, y = income_post)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  theme_minimal() +
  labs(
    title = "Correlation Between Pre and Post COVID Income",
    x = "Pre-COVID Average Income",
    y = "Post-COVID Average Income"
  )
# Multiple linear regression model
# Predict post-COVID average ZHVI from its pre-COVID level plus post-COVID
# GDP, income and population growth.
model_multiple <- lm(zhvi_post_avg ~ zhvi_pre_avg + gdp_post + income_post + population_post, data = merged_df)
# View summary
summary(model_multiple)
##
## Call:
## lm(formula = zhvi_post_avg ~ zhvi_pre_avg + gdp_post + income_post +
## population_post, data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31408 -9558 -1851 10153 52293
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.288e+04 1.610e+04 -0.800 0.42755
## zhvi_pre_avg 1.201e+00 2.304e-02 52.111 < 2e-16 ***
## gdp_post 2.692e+03 1.073e+03 2.508 0.01575 *
## income_post -9.186e+02 1.073e+03 -0.856 0.39625
## population_post 9.310e+03 2.779e+03 3.350 0.00162 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16620 on 46 degrees of freedom
## Multiple R-squared: 0.9854, Adjusted R-squared: 0.9841
## F-statistic: 774.8 on 4 and 46 DF, p-value: < 2.2e-16
# income_post is not significant (p = 0.396) — it is dropped in the reduced
# model fitted next.
# Multiple linear regression model
# Reduced model: income_post removed (not significant in the full model).
# NOTE(review): this reuses the name model_multiple, overwriting the full
# model above; a distinct name would let the two fits be compared.
model_multiple <- lm(zhvi_post_avg ~ zhvi_pre_avg + gdp_post + population_post, data = merged_df)
# View summary
summary(model_multiple)
##
## Call:
## lm(formula = zhvi_post_avg ~ zhvi_pre_avg + gdp_post + population_post,
## data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32313 -10163 -790 11831 52531
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.489e+04 7.881e+03 -3.158 0.00277 **
## zhvi_pre_avg 1.205e+00 2.243e-02 53.734 < 2e-16 ***
## gdp_post 2.975e+03 1.018e+03 2.922 0.00533 **
## population_post 8.007e+03 2.319e+03 3.453 0.00118 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16580 on 47 degrees of freedom
## Multiple R-squared: 0.9851, Adjusted R-squared: 0.9842
## F-statistic: 1039 on 3 and 47 DF, p-value: < 2.2e-16
# All remaining predictors significant; adjusted R-squared unchanged (0.984).
# Cleaned, merged state-level dataset used by all statistical tests below.
df_stat <- read_csv("data/data-clean/FINAL.csv")
# df_stat
| Variable | ANOVA p-value | Post-Hoc Interpretation | Shapiro p-value | Levene p-value |
|---|---|---|---|---|
| gdp_post | 2.40e-08 | All three groups significantly different from each other | 0.539 | 0.273 |
| income_post | 0.039 | Low group < Medium & High groups (no difference between Medium & High) | 0.925 | 0.344 |
| unemp_post | 0.122 | No statistically significant differences between groups | 0.306 | 0.605 |
| total_post | 0.00016 | High group differs from both Low and Medium (Low vs Medium not significant) | 0.019 | 0.138 |
| creditcard_post | 0.333 | No statistically significant differences between groups | 0.930 | 0.188 |
# Split states into population-growth tertiles (Small / Medium / Large).
df_stat$pop_group <- cut(df_stat$population_post,
breaks = quantile(df_stat$population_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Small", "Medium", "Large"),
include.lowest = TRUE)
# One-way ANOVA: does post-COVID housing price change differ across groups?
pop_anova <- aov(zhvi_post_change ~ pop_group, data = df_stat)
summary(pop_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## pop_group 2 2412 1206.2 25.39 3.01e-08 ***
## Residuals 48 2280 47.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fisher LSD pairwise comparisons (no multiplicity adjustment).
PostHocTest(pop_anova, method = "lsd")
##
## Posthoc multiple comparisons of means : Fisher LSD
## 95% family-wise confidence level
##
## $pop_group
## diff lwr.ci upr.ci pval
## Medium-Small 4.031883 -0.7216142 8.78538 0.0946 .
## Large-Small 16.181339 11.4278415 20.93484 1.3e-08 ***
## Large-Medium 12.149456 7.3959584 16.90295 5.0e-06 ***
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Distribution of post-COVID housing price change within each tertile.
df_stat %>%
  ggplot(aes(x = pop_group, y = zhvi_post_change, fill = pop_group)) +
  geom_boxplot() +
  labs(title = "Post-COVID Housing Price Change by Population Group",
       x = "Population Group",
       y = "zhvi_post_change") +
  theme_minimal()
# residual analysis to check the assumptions
# Shapiro-Wilk on residuals: H0 = normality (p > 0.05 -> assumption holds).
shapiro.test(pop_anova$residuals)
##
## Shapiro-Wilk normality test
##
## data: pop_anova$residuals
## W = 0.96771, p-value = 0.1774
# Levene's test for homogeneity of variance
# H0 = equal variances across groups (p > 0.05 -> assumption holds).
leveneTest(zhvi_post_change ~ factor(pop_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 1.0936 0.3432
## 48
# Interactive table of each state's group assignment.
pop_group_table <- df_stat %>%
select(State, population_post, zhvi_post_change, pop_group)
datatable(pop_group_table,
options = list(pageLength = 10, autoWidth = TRUE),
caption = "States by Population Group (Small / Medium / Large)")
# Split states into income-growth tertiles (Low / Medium / High).
df_stat$income_group <- cut(df_stat$income_post,
breaks = quantile(df_stat$income_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE)
# One-way ANOVA on housing price change by income group.
income_anova <- aov(zhvi_post_change ~ income_group, data = df_stat)
summary(income_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## income_group 2 593 296.7 3.474 0.039 *
## Residuals 48 4099 85.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fisher LSD: Low differs from Medium and High; Medium vs High does not.
PostHocTest(income_anova, method = "lsd")
##
## Posthoc multiple comparisons of means : Fisher LSD
## 95% family-wise confidence level
##
## $income_group
## diff lwr.ci upr.ci pval
## Medium-Low 7.6532966 1.2800167 14.026577 0.0196 *
## High-Low 6.7293284 0.3560485 13.102608 0.0389 *
## High-Medium -0.9239682 -7.2972481 5.449312 0.7719
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Distribution of housing price change within each income tertile.
df_stat %>%
  ggplot(aes(x = income_group, y = zhvi_post_change, fill = income_group)) +
  geom_boxplot() +
  labs(title = "Housing Price Change by Income Group",
       x = "Income Group",
       y = "zhvi_post_change") +
  theme_minimal()
# residual analysis to check the assumptions
# Shapiro-Wilk on residuals: H0 = normality (p > 0.05 -> assumption holds).
shapiro.test(income_anova$residuals)
##
## Shapiro-Wilk normality test
##
## data: income_anova$residuals
## W = 0.98935, p-value = 0.9254
# Levene's test for homogeneity of variance
# H0 = equal variances across groups (p > 0.05 -> assumption holds).
leveneTest(zhvi_post_change ~ factor(income_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 1.0908 0.3441
## 48
# Interactive table of each state's group assignment.
income_group_table <- df_stat %>%
select(State, income_post, zhvi_post_change, income_group)
datatable(income_group_table,
options = list(pageLength = 10, autoWidth = TRUE),
caption = "States by Income Group (Low / Medium / High)")
# Split states into GDP-growth tertiles (Low / Medium / High).
df_stat$gdp_group <- cut(df_stat$gdp_post,
breaks = quantile(df_stat$gdp_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE)
# One-way ANOVA on housing price change by GDP group.
gdp_anova <- aov(zhvi_post_change ~ gdp_group, data = df_stat)
summary(gdp_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## gdp_group 2 2434 1216.9 25.86 2.4e-08 ***
## Residuals 48 2259 47.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fisher LSD: all three pairwise differences are significant.
PostHocTest(gdp_anova, method = "lsd")
##
## Posthoc multiple comparisons of means : Fisher LSD
## 95% family-wise confidence level
##
## $gdp_group
## diff lwr.ci upr.ci pval
## Medium-Low 5.17528 0.444203 9.906358 0.0327 *
## High-Low 16.53958 11.808498 21.270653 6.6e-09 ***
## High-Medium 11.36430 6.633218 16.095372 1.4e-05 ***
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Normality and equal-variance assumptions both hold (p > 0.05).
shapiro.test(gdp_anova$residuals)
##
## Shapiro-Wilk normality test
##
## data: gdp_anova$residuals
## W = 0.98001, p-value = 0.5392
leveneTest(zhvi_post_change ~ factor(gdp_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 1.3341 0.273
## 48
# Boxplot of housing price change within GDP tertiles, plus a table of
# each state's assignment.
df_stat %>%
  ggplot(aes(x = gdp_group, y = zhvi_post_change, fill = gdp_group)) +
  geom_boxplot() +
  labs(title = "Housing Price Change by GDP Group",
       x = "GDP Group",
       y = "zhvi_post_change") +
  theme_minimal()
gdp_group_table <- select(df_stat, State, gdp_post, zhvi_post_change, gdp_group)
datatable(gdp_group_table,
          caption = "States by GDP Group (Low / Medium / High)",
          options = list(pageLength = 10, autoWidth = TRUE))
# Split states into unemployment-change tertiles (Low / Medium / High).
df_stat$unemp_group <- cut(df_stat$unemp_post,
breaks = quantile(df_stat$unemp_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE)
# One-way ANOVA: overall effect not significant (p = 0.122).
unemp_anova <- aov(zhvi_post_change ~ unemp_group, data = df_stat)
summary(unemp_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## unemp_group 2 394 197.17 2.202 0.122
## Residuals 48 4298 89.55
# Fisher LSD shown for completeness; note the High-Low contrast reaches
# p = 0.044 even though the omnibus ANOVA is not significant.
PostHocTest(unemp_anova, method = "lsd")
##
## Posthoc multiple comparisons of means : Fisher LSD
## 95% family-wise confidence level
##
## $unemp_group
## diff lwr.ci upr.ci pval
## Medium-Low 4.306174 -2.1287038 10.741051 0.1848
## High-Low 6.813317 0.1859737 13.440661 0.0441 *
## High-Medium 2.507144 -4.0303336 9.044621 0.4444
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Assumption checks: both pass (p > 0.05).
shapiro.test(unemp_anova$residuals)
##
## Shapiro-Wilk normality test
##
## data: unemp_anova$residuals
## W = 0.97346, p-value = 0.3061
leveneTest(zhvi_post_change ~ factor(unemp_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.5078 0.605
## 48
# Boxplot of housing price change within unemployment tertiles, plus a
# table of each state's assignment.
df_stat %>%
  ggplot(aes(x = unemp_group, y = zhvi_post_change, fill = unemp_group)) +
  geom_boxplot() +
  labs(title = "Housing Price Change by Unemployment Group",
       x = "Unemployment Group",
       y = "zhvi_post_change") +
  theme_minimal()
unemp_group_table <- select(df_stat, State, unemp_post, zhvi_post_change, unemp_group)
datatable(unemp_group_table,
          caption = "States by Unemployment Group (Low / Medium / High)",
          options = list(pageLength = 10, autoWidth = TRUE))
# Split states into total-debt-change tertiles (Low / Medium / High).
df_stat$total_group <- cut(df_stat$total_post,
breaks = quantile(df_stat$total_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE)
# One-way ANOVA on housing price change by total debt group.
total_anova <- aov(zhvi_post_change ~ total_group, data = df_stat)
summary(total_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## total_group 2 1435 717.6 10.57 0.000157 ***
## Residuals 48 3258 67.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fisher LSD: High differs from both Low and Medium; Low vs Medium does not.
PostHocTest(total_anova, method = "lsd")
##
## Posthoc multiple comparisons of means : Fisher LSD
## 95% family-wise confidence level
##
## $total_group
## diff lwr.ci upr.ci pval
## Medium-Low -0.1750897 -5.856411 5.506232 0.95085
## High-Low 11.1646443 5.483323 16.845966 0.00025 ***
## High-Medium 11.3397340 5.658412 17.021056 0.00021 ***
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOTE(review): Shapiro p = 0.019 < 0.05, so the normality assumption is
# violated here — interpret this ANOVA with caution (a Kruskal-Wallis test
# could serve as a robustness check).
shapiro.test(total_anova$residuals)
##
## Shapiro-Wilk normality test
##
## data: total_anova$residuals
## W = 0.94454, p-value = 0.01876
leveneTest(zhvi_post_change ~ factor(total_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 2.0672 0.1377
## 48
# Boxplot of housing price change within total-debt tertiles, plus a table
# of each state's assignment.
df_stat %>%
  ggplot(aes(x = total_group, y = zhvi_post_change, fill = total_group)) +
  geom_boxplot() +
  labs(title = "Housing Price Change by Total Debt Group",
       x = "Total Debt Group",
       y = "zhvi_post_change") +
  theme_minimal()
total_group_table <- select(df_stat, State, total_post, zhvi_post_change, total_group)
datatable(total_group_table,
          caption = "States by Total Debt Group (Low / Medium / High)",
          options = list(pageLength = 10, autoWidth = TRUE))
# Split states into credit-card-debt-change tertiles (Low / Medium / High).
df_stat$creditcard_group <- cut(df_stat$creditcard_post,
breaks = quantile(df_stat$creditcard_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE)
# One-way ANOVA: no significant group effect (p = 0.333).
creditcard_anova <- aov(zhvi_post_change ~ creditcard_group, data = df_stat)
summary(creditcard_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## creditcard_group 2 210 104.99 1.124 0.333
## Residuals 48 4483 93.39
# Fisher LSD: no pairwise contrast is significant either.
PostHocTest(creditcard_anova, method = "lsd")
##
## Posthoc multiple comparisons of means : Fisher LSD
## 95% family-wise confidence level
##
## $creditcard_group
## diff lwr.ci upr.ci pval
## Medium-Low 4.930728 -1.733912 11.595367 0.1434
## High-Low 3.007060 -3.657579 9.671699 0.3688
## High-Medium -1.923668 -8.588307 4.740972 0.5644
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Assumption checks: both pass (p > 0.05).
shapiro.test(creditcard_anova$residuals)
##
## Shapiro-Wilk normality test
##
## data: creditcard_anova$residuals
## W = 0.98949, p-value = 0.9295
leveneTest(zhvi_post_change ~ factor(creditcard_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 1.7293 0.1883
## 48
# Boxplot of housing price change within credit-card-debt tertiles, plus a
# table of each state's assignment.
df_stat %>%
  ggplot(aes(x = creditcard_group, y = zhvi_post_change, fill = creditcard_group)) +
  geom_boxplot() +
  labs(title = "Housing Price Change by Credit Card Debt Group",
       x = "Credit Card Debt Group",
       y = "zhvi_post_change") +
  theme_minimal()
credit_group_table <- select(df_stat, State, creditcard_post, zhvi_post_change, creditcard_group)
datatable(credit_group_table,
          caption = "States by Credit Card Debt Group (Low / Medium / High)",
          options = list(pageLength = 10, autoWidth = TRUE))
# Regression tree: which indicators best split states by post-COVID housing
# price change? method = "anova" fits a regression tree because the
# response (price change %) is continuous, not a class label.
tree_model <- rpart(zhvi_post_change ~ zhvi_post_avg + population_post + gdp_post + income_post + unemp_post + total_post + creditcard_post
,data = df_stat
,method = "anova"
,control = rpart.control(cp = 0.0001,minsplit = 10, minbucket = 4)) # anova, not class: continuous response
rpart.plot(tree_model,
extra = 101,
fallen.leaves = TRUE,
main = "Decision Tree")
# Importance scores: GDP and population growth dominate the splits.
tree_model$variable.importance
## gdp_post population_post total_post creditcard_post income_post
## 2899.6427 2486.3110 1887.3291 836.0229 485.0208
## zhvi_post_avg unemp_post
## 309.8245 182.1748
# Map each state to its terminal node in the fitted regression tree.
tree_nodes <- data.frame(State = df_stat$State, Node = tree_model$where)
datatable(tree_nodes,
          caption = "state",
          filter = "top")
# K-means on six standardized post-COVID features; scaling keeps any one
# feature from dominating the Euclidean distances.
clusters <- c("population_post", "gdp_post", "income_post", "unemp_post", "total_post", "zhvi_post_avg")
cluster_z <- scale(df_stat[, clusters])
set.seed(612)
# Store the fit as km_fit: the original `kmeans <- kmeans(...)` shadowed
# the base stats::kmeans() function with the result object.
km_fit <- kmeans(cluster_z, centers = 5)
#km_fit
df_stat$cluster <- as.factor(km_fit$cluster)
ggplot(df_stat, aes(x = cluster, y = zhvi_post_change, fill = cluster)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Cluster for Post COVID",
x = "Cluster Group",
y = "zhvi_post_change")
# List the states belonging to each cluster.
cluster_nodes <- data.frame(State = df_stat$State, Cluster = df_stat$cluster)
#cluster_nodes
split(cluster_nodes$State, cluster_nodes$Cluster)
## $`1`
## [1] "CA" "DC" "HI" "MA"
##
## $`2`
## [1] "AL" "GA" "IN" "KS" "KY" "ME" "MN" "MO" "NH" "OR" "SC" "VT" "VA" "WI"
##
## $`3`
## [1] "AK" "CT" "IL" "LA" "MD" "MI" "MS" "NJ" "NM" "NY" "OH" "PA" "RI" "WV"
##
## $`4`
## [1] "AZ" "CO" "FL" "ID" "MT" "NV" "NC" "TN" "TX" "UT" "WA"
##
## $`5`
## [1] "AR" "DE" "IA" "NE" "ND" "OK" "SD" "WY"
# Mean of each (unscaled) feature per cluster, for interpretation.
cluster_summary <- aggregate(df_stat[, clusters], by = list(Cluster = df_stat$cluster), FUN = mean)
cluster_summary[ , -1] <- round(cluster_summary[ , -1], 2)
print(cluster_summary)
## Cluster population_post gdp_post income_post unemp_post total_post
## 1 1 1.25 7.76 10.07 -4.65 13.62
## 2 2 2.24 8.05 12.35 -1.89 11.48
## 3 3 1.19 6.02 10.49 -3.55 11.12
## 4 4 4.13 11.76 14.58 -2.51 16.86
## 5 5 2.80 5.51 16.95 -1.64 11.73
## zhvi_post_avg
## 1 628886.4
## 2 265625.4
## 3 266754.1
## 4 366580.3
## 5 229319.0
# Second k-means on a reduced, standardized feature set.
clusters2 <- c("population_post", "income_post", "total_post", "zhvi_post_avg")
cluster_z2 <- scale(df_stat[, clusters2])
set.seed(612)
kmeans2 <- kmeans(cluster_z2, centers = 6)
df_stat$cluster2 <- as.factor(kmeans2$cluster)
ggplot(df_stat, aes(x = cluster2, y = zhvi_post_change, fill = cluster2)) +
geom_boxplot() +
theme_minimal() +
# Title fixed to match the features actually clustered (the original said
# "GDP + Income + Population", but GDP is not in clusters2).
labs(title = "Clustering: Population + Income + Total Debt + ZHVI",
x = "Cluster",
y = "Housing Price Change (%)")
# List the states belonging to each cluster.
cluster_nodes2 <- data.frame(State = df_stat$State, Cluster = df_stat$cluster2)
split(cluster_nodes2$State, cluster_nodes2$Cluster)
## $`1`
## [1] "AR" "IA" "NE" "ND" "OK" "SD" "WY"
##
## $`2`
## [1] "AZ" "DE" "FL" "GA" "MT" "NV" "NC" "SC" "TN" "TX"
##
## $`3`
## [1] "CT" "MD" "MA" "NH" "NJ" "NY" "PA" "RI" "VT"
##
## $`4`
## [1] "CO" "ID" "UT"
##
## $`5`
## [1] "CA" "DC" "HI" "OR" "WA"
##
## $`6`
## [1] "AL" "AK" "IL" "IN" "KS" "KY" "LA" "ME" "MI" "MN" "MS" "MO" "NM" "OH" "VA"
## [16] "WV" "WI"
# Mean of each (unscaled) feature per cluster, for interpretation.
cluster_summary2 <- aggregate(df_stat[, clusters2], by = list(Cluster = df_stat$cluster2), FUN = mean)
cluster_summary2[ , -1] <- round(cluster_summary2[ , -1], 2)
print(cluster_summary2)
## Cluster population_post income_post total_post zhvi_post_avg
## 1 1 2.51 17.21 12.12 216543.8
## 2 2 4.14 13.60 14.35 299750.4
## 3 3 1.91 9.70 10.57 359597.0
## 4 4 4.54 16.79 19.66 446324.5
## 5 5 1.61 10.90 15.33 593770.3
## 6 6 1.35 12.19 11.23 222252.1